#!/bin/bash

# Set up OntoNotes (CoNLL-2012) train/development/test data
# --- Dataset identity and raw-corpus location --------------------------------
# Reads:   DATA_DIR, DILATED_CNN_NER_ROOT
# Exports: data_name, raw_data_dir, data_files, max_sent_len, process_script
data_name="ontonotes"
raw_data_dir="${DATA_DIR}/conll-2012/v4/data"

# Names of the data splits. NOTE: bash marks the array as exported but does
# not pass array variables to child processes; it is only visible to code
# running in this same shell.
data_files=("train" "development" "test")

# TODO: compute this value automatically instead of hard-coding it.
max_sent_len=126

# Script referenced for processing this dataset.
process_script="${DILATED_CNN_NER_ROOT}/src/tsv_to_tfrecords.py"

export data_name raw_data_dir data_files max_sent_len process_script

# --- Processed-data directory -------------------------------------------------
# Reads:   processed_data_dir, data_name, filter_width, embeddings_name,
#          start_end, predict_pad, documents
# Exports: data_dir, maps_dir
#
# The directory name starts as
#   <processed_data_dir>/<data_name>-w<filter_width>-<embeddings_name>
# and gains one suffix per option whose value is exactly the string "true",
# always in the order: -start_end, -pred_pad, -docs.
data_dir="${processed_data_dir}/${data_name}-w${filter_width}-${embeddings_name}"

case "$start_end" in
    true) data_dir+="-start_end" ;;
esac

case "$predict_pad" in
    true) data_dir+="-pred_pad" ;;
esac

case "$documents" in
    true) data_dir+="-docs" ;;
esac

# maps_dir points at the "train" subdirectory of the final data_dir.
maps_dir="${data_dir}/train"

export data_dir maps_dir

# --- Vocabulary file ----------------------------------------------------------
# Reads:   DILATED_CNN_NER_ROOT, data_name
# Exports: vocab_cutoff, update_vocab_file
#
# The cutoff value is embedded in the vocab file name:
#   <root>/data/vocabs/<data_name>_cutoff_<vocab_cutoff>.txt
vocab_cutoff=4
update_vocab_file="${DILATED_CNN_NER_ROOT}/data/vocabs/${data_name}_cutoff_${vocab_cutoff}.txt"

export vocab_cutoff update_vocab_file

# --- Per-split example file patterns ------------------------------------------
# Reads:   data_dir
# Exports: train_dir, dev_dir, test_dir
#
# Each value ends in the literal text  \*_examples.proto  -- the backslash is
# stored in the variable, so the asterisk is not expanded here.
# NOTE(review): presumably the escape is there so the pattern survives a later
# unquoted expansion or eval by whatever consumes these variables -- confirm.
# NOTE(review): the split is named "dev" here while data_files lists
# "development"; verify the processing step writes to "dev".
train_dir="${data_dir}"'/train/protos/\*_examples.proto'
dev_dir="${data_dir}"'/dev/protos/\*_examples.proto'
test_dir="${data_dir}"'/test/protos/\*_examples.proto'

export train_dir dev_dir test_dir

